library(tidyverse)
library(janitor)
library(knitr)
library(tidyr)
library(readxl)
library(lubridate)
library(readr)
library(stringr)
library(ggplot2)
library(tsibble)
library(dbplyr)
library(stats)
library(plotly)
library(ggridges)
library(forecast)
library(patchwork)
library(dplyr)
library(flexdashboard)
library(leaflet)
library(shiny)
library(leaflet.providers)
knitr::opts_chunk$set(
echo = TRUE,
warning = FALSE,
fig.width = 8,
fig.height = 7,
out.width = "90%"
)
theme_set(theme_minimal() + theme(legend.position = "bottom"))
options(
ggplot2.continuous.colour = "viridis",
ggplot2.continuous.fill = "viridis"
)
scale_colour_discrete = scale_color_viridis_d
scale_fill_discrete = scale_fill_viridis_d
We like many New Yorkers see rats as a major problem that has only worsened following the pandemic. New York City government agrees and has implemented and promoted a flashy new initiative they are calling “Send Rats Packing.” https://www.nyc.gov/assets/queenscb2/downloads/pdf/notices/2023/SetoutTimes-Residential-Flyer-2023-02.pdf This initiative is mainly composed of a new rule involving trash that aims to reduce the time that trash, recycling, and curbside composting will sit on the sidewalk. The new rule went into effect on April 1, 2023 and left Residential buildings and their managers with two options -Place waste out after 6:00 PM in a container of 55 gallons or less with a secure lid or Place waste out after 8:00 PM, if putting bags directly on the curb. Although everyone wants to see rats gone not everyone is on board with this new rule and many question if it is actually going to result in less rats. The NYC Building Supers an organization composed of building maintenance workers like porters and supers across the 5 boroughs has called this rule “outrageous and unfair” which requires them to work 14 hour day just to comply with the new rule. They have banded together to strike this rule by engaging in city hall protests and acts of civil disobedience by not complying with the new trash time. https://nycbuildingsupers.com/ Given this backdrop and the general skepticism of the effectiveness of the measure we were motivated to explore whether or not this trash time is effective at reducing the presence of rats across the 5 boroughs. ## Cleaning rat sightings dataset
rat_sightings =
read_csv ("./data/Rat_Sightings.csv") |>
janitor::clean_names(case = "snake") |>
separate(created_date, sep="/", into = c("month", "day", "year")) |>
separate(year, sep=" ", into = c("year")) |>
filter(borough != "STATEN ISLAND") |>
filter(year %in% c("2019", "2020", "2021", "2022", "2023")) |>
mutate(
borough_id = recode(
borough,
"MANHATTAN" = 1,
"BRONX" =2,
"BROOKLYN"=3,
"QUEENS"= 4)) |>
mutate(
month = as.numeric(month),
year = as.numeric(year)
) |>
select(unique_key, month, day, year, location_type, incident_zip, borough, location, borough_id) |>
mutate(
borough = str_to_sentence(borough)
)
waste_tonnage = read_csv("./data/DSNY_Monthly_Tonnage_Data_20231202.csv") %>%
clean_names(case = "snake") %>%
mutate(date_split = strsplit(month, "/")) %>%
mutate(
year = as.integer(sapply(date_split, function(x) x[1])),
month = as.integer(sapply(date_split, function(x) x[2]))
) %>%
filter(year %in% c(2022, 2023)) %>%
mutate(total_organics = resorganicstons + schoolorganictons)
## Rows: 23296 Columns: 11
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): MONTH, BOROUGH, COMMUNITYDISTRICT
## dbl (8): REFUSETONSCOLLECTED, PAPERTONSCOLLECTED, MGPTONSCOLLECTED, RESORGAN...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
waste_tonnage = waste_tonnage %>%
group_by(borough, month, year, borough_id) %>%
summarise(
total_organics = sum(total_organics, na.rm = TRUE),
total_refuse = sum(refusetonscollected, na.rm = TRUE)
)
## `summarise()` has grouped output by 'borough', 'month', 'year'. You can
## override using the `.groups` argument.
rat_waste_merged = left_join(rat_sightings, waste_tonnage, by = c("borough_id", "month", "year", "borough"))
rat_waste_filtered = subset(rat_waste_merged, !is.na(total_organics) & !is.na(total_refuse))
waste_tonnage_by_borough = rat_waste_filtered |>
group_by(borough) |>
summarise(total_tonnage = sum(total_organics + total_refuse))
ggplot(waste_tonnage_by_borough, aes(x = reorder(borough, -total_tonnage), y = total_tonnage)) +
geom_bar(stat = "identity", fill = "blue") +
labs(x = "Borough", y = "Trash Tonnage", title = "Trash Tonnage by Borough, 2022 - 2023") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
waste_by_borough_refuse <- aggregate(total_refuse ~ borough, data=waste_tonnage, sum)
# barplot for total refuse
barplot(waste_by_borough_refuse$total_refuse,
main="Total Refuse Production by Borough",
xlab="Borough",
ylab="Total Refuse (tons)",
col="blue",
names.arg=waste_by_borough_refuse$borough,
las=2)
waste_by_borough_organics <- aggregate(total_organics ~ borough, data=waste_tonnage, sum)
# barplot for total organics
barplot(waste_by_borough_organics$total_organics,
main="Total Organics Production by Borough",
xlab="Borough",
ylab="Total Organics (tons)",
col="green",
names.arg=waste_by_borough_organics$borough,
las=2)
total_tonnage_by_borough_year_month = rat_waste_filtered |>
group_by(borough, year, month) |>
summarise(total_tonnage = sum(total_organics + total_refuse))
## `summarise()` has grouped output by 'borough', 'year'. You can override using
## the `.groups` argument.
total_tonnage_by_borough_year_month$year_month <- as.Date(paste(total_tonnage_by_borough_year_month$year, total_tonnage_by_borough_year_month$month, "01", sep = "-"))
ggplot(total_tonnage_by_borough_year_month, aes(x = year_month, y = total_tonnage, group = borough, color = borough)) +
geom_line() +
labs(x = "Year-Month", y = "Amount of Trash (in tons)", title = "Trash Tonnage by Month and Borough") +
scale_x_date(date_labels = "%Y-%m", date_breaks = "1 month") +
scale_y_continuous(labels = scales::number_format(scale = 1e-6)) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
anova_result_organics = aov(total_organics ~ borough, data = waste_tonnage) |> broom::tidy()
anova_result_refuse = aov(total_refuse ~ borough, data = waste_tonnage) |> broom::tidy()
The one-way ANOVA test was performed to assess the differences in organic waste tonnage among the boroughs. The ANOVA test revealed statistically significant differences in organic waste tonnage among the boroughs (F(4, 112) = 10.5, p < 0.05).
A one-way ANOVA test was conducted to examine the differences in refuse waste tonnage among the boroughs.The ANOVA test revealed statistically significant differences in refuse waste tonnage among the boroughs (F(4, 112) = 82.1, p < 0.001).
waste_tonnage = read_csv("./data/DSNY_Monthly_Tonnage_Data_20231202.csv") %>%
clean_names(case = "snake") %>%
mutate(date_split = strsplit(month, "/")) %>%
mutate(
year = as.integer(sapply(date_split, function(x) x[1])),
month = as.integer(sapply(date_split, function(x) x[2]))
) %>%
filter(year %in% c(2022, 2023)) %>%
mutate(total_organics = resorganicstons + schoolorganictons)
## Rows: 23296 Columns: 11
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): MONTH, BOROUGH, COMMUNITYDISTRICT
## dbl (8): REFUSETONSCOLLECTED, PAPERTONSCOLLECTED, MGPTONSCOLLECTED, RESORGAN...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
waste_tonnage = waste_tonnage %>%
group_by(borough, month, year, borough_id) %>%
summarise(
total_organics = sum(total_organics, na.rm = TRUE),
total_refuse = sum(refusetonscollected, na.rm = TRUE)
)
## `summarise()` has grouped output by 'borough', 'month', 'year'. You can
## override using the `.groups` argument.
rat_waste_merged = left_join(rat_sightings, waste_tonnage, by = c("borough_id", "month", "year", "borough"))
rat_sightings =
read_csv ("./data/Rat_Sightings.csv") |>
janitor::clean_names(case = "snake") |>
separate(created_date, sep="/", into = c("month", "day", "year")) |>
separate(year, sep=" ", into = c("year")) |>
filter(borough != "STATEN ISLAND") |>
filter(year %in% c("2019", "2020", "2021", "2022", "2023")) |>
mutate(
borough_id = recode(
borough,
"MANHATTAN" = 1,
"BRONX" =2,
"BROOKLYN"=3,
"QUEENS"= 4)) |>
mutate(
month = as.numeric(month),
year = as.numeric(year)
) |>
select(unique_key, month, day, year, location_type, incident_zip, borough, location, borough_id) |>
mutate(
borough = str_to_sentence(borough)
)
## Rows: 232625 Columns: 38
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (25): Created Date, Closed Date, Agency, Agency Name, Complaint Type, De...
## dbl (6): Unique Key, Incident Zip, X Coordinate (State Plane), Y Coordinate...
## lgl (7): Vehicle Type, Taxi Company Borough, Taxi Pick Up Location, Bridge ...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
summary(rat_sightings)
## unique_key month day year
## Min. :41310910 Min. : 1.000 Length:105186 Min. :2019
## 1st Qu.:47382874 1st Qu.: 4.000 Class :character 1st Qu.:2020
## Median :52406739 Median : 7.000 Mode :character Median :2021
## Mean :51687699 Mean : 6.543 Mean :2021
## 3rd Qu.:55862392 3rd Qu.: 9.000 3rd Qu.:2022
## Max. :59604845 Max. :12.000 Max. :2023
##
## location_type incident_zip borough location
## Length:105186 Min. :10000 Length:105186 Length:105186
## Class :character 1st Qu.:10038 Class :character Class :character
## Mode :character Median :11205 Mode :character Mode :character
## Mean :10783
## 3rd Qu.:11226
## Max. :12345
## NA's :31
## borough_id
## Min. :1.00
## 1st Qu.:1.00
## Median :3.00
## Mean :2.44
## 3rd Qu.:3.00
## Max. :4.00
## NA's :21
variable_types <- sapply(rat_sightings, class)
print(variable_types)
## unique_key month day year location_type
## "numeric" "numeric" "character" "numeric" "character"
## incident_zip borough location borough_id
## "numeric" "character" "character" "numeric"
# variables are not classified well for analysis so will need to convert numeric variables
numeric_vars_to_convert <- c("unique_key", "month", "year", "incident_zip", "borough_id")
rat_sightings <- rat_sightings %>%
mutate(across(all_of(numeric_vars_to_convert), as.factor))
variable_types <- sapply(rat_sightings, class)
print(variable_types)
## unique_key month day year location_type
## "factor" "factor" "character" "factor" "character"
## incident_zip borough location borough_id
## "factor" "character" "character" "factor"
# number of rat sightings by boro each year
rats_boro = rat_sightings %>%
janitor::clean_names() %>%
select(borough, year, unique_key) %>%
group_by(borough, year) %>%
count() %>%
summarize(avg_rat_sightings = mean(n)) %>%
ungroup %>%
spread(key = year, value = avg_rat_sightings) %>%
filter(borough != 'Unspecified')# I want to remove the unsepcified
## `summarise()` has grouped output by 'borough'. You can override using the
## `.groups` argument.
knitr::kable(rats_boro)
| borough | 2019 | 2020 | 2021 | 2022 | 2023 |
|---|---|---|---|---|---|
| Bronx | 2789 | 2762 | 4469 | 4127 | 3378 |
| Brooklyn | 6414 | 5995 | 9500 | 10122 | 9597 |
| Manhattan | 4318 | 4180 | 6947 | 7455 | 6232 |
| Queens | 2522 | 2711 | 3400 | 4092 | 4155 |
We can see from the kable output that there was quite a substantial jump in rat sightings from 2020 to 2021 in all of the boroughs. This may be another COVID phenomena as restaurants shifted to more outdoor dining which deposited more food waste and other things that attract rats onto the streets during the pandemic and after the pandemic when indoor dining became less feasible. https://apnews.com/article/rats-new-york-9dc65afa66a3535cba01b1ea866973a1#:~:text=NEW%20YORK%20(AP)%20%E2%80%94%20They,so%20did%20the%20city’s%20rats.
# I will test the statistical difference of average rat sighting across boroughs and across time.
rat_sightings_agg = rat_sightings |>
group_by(year, borough, month) |>
filter(borough != "Unspecified") %>%
summarise(count = n())
## `summarise()` has grouped output by 'year', 'borough'. You can override using
## the `.groups` argument.
anova_result = aov(count ~ factor(year) * factor(borough), data = rat_sightings_agg) |> broom::tidy()
anova_result_no_interaction = aov(count ~ factor(year) + factor(borough), data = rat_sightings_agg) |> broom::tidy()
# Print the summary to get F-statistic and p-value
anova1_summary <- summary(anova_result)
knitr::kable(anova1_summary)
| term | df | sumsq | meansq | statistic | p.value | |
|---|---|---|---|---|---|---|
| Length:4 | Min. : 3.00 | Min. : 548327 | Min. : 16574 | Min. : 2.757 | Min. :0.0000000 | |
| Class :character | 1st Qu.: 3.75 | 1st Qu.:1776348 | 1st Qu.: 38414 | 1st Qu.: 17.863 | 1st Qu.:0.0000000 | |
| Mode :character | Median : 8.00 | Median :2882831 | Median : 296058 | Median : 32.969 | Median :0.0000000 | |
| NA | Mean : 58.75 | Mean :3310296 | Mean : 729439 | Mean : 58.348 | Mean :0.0005540 | |
| NA | 3rd Qu.: 63.00 | 3rd Qu.:4416779 | 3rd Qu.: 987083 | 3rd Qu.: 86.144 | 3rd Qu.:0.0008311 | |
| NA | Max. :216.00 | Max. :6927195 | Max. :2309065 | Max. :139.319 | Max. :0.0016621 | |
| NA | NA | NA | NA | NA’s :1 | NA’s :1 |
anova2_summary <- summary(anova_result_no_interaction)
knitr::kable(anova2_summary)
| term | df | sumsq | meansq | statistic | p.value | |
|---|---|---|---|---|---|---|
| Length:3 | Min. : 3.00 | Min. :2185688 | Min. : 18107 | Min. : 30.18 | Min. :0 | |
| Class :character | 1st Qu.: 3.50 | 1st Qu.:3156994 | 1st Qu.: 282264 | 1st Qu.: 54.52 | 1st Qu.:0 | |
| Mode :character | Median : 4.00 | Median :4128300 | Median : 546422 | Median : 78.85 | Median :0 | |
| NA | Mean : 78.33 | Mean :4413728 | Mean : 957865 | Mean : 78.85 | Mean :0 | |
| NA | 3rd Qu.:116.00 | 3rd Qu.:5527748 | 3rd Qu.:1427744 | 3rd Qu.:103.19 | 3rd Qu.:0 | |
| NA | Max. :228.00 | Max. :6927195 | Max. :2309065 | Max. :127.53 | Max. :0 | |
| NA | NA | NA | NA | NA’s :1 | NA’s :1 |
From the ANOVA tests above we can see from the p-value of both tests being <0.001 we can reject the null hypotheses and conclude that at least one of the average rat sightings by borough and by year are statistically different.
rat_sightings_agg <- rat_sightings_agg %>%
mutate(common_date = paste0(year, "-", month),
common_date = lubridate::ym(common_date))
# Plot rat sightings over time
rats_yr_plot <- rat_sightings_agg %>%
ggplot(aes(x = common_date, y = count, group = year, color = year)) +
geom_point(alpha = .3) +
geom_smooth(se = FALSE) +
facet_wrap(~ borough, scales = "free_y", ncol = 2) + # Facet wrap by borough
labs(
title = "Rat Sightings Over Time",
x = "Date",
y = "Rat Sightings Count") +
theme(text = element_text(size = 15),
axis.text.x = element_text(angle = 60, hjust = 1, size = 10)) +
scale_colour_discrete("Year") +
scale_x_date(date_breaks = "3 month", labels = function(x) format(x, "%b")) +
scale_color_viridis_d(end = .8)
## Scale for colour is already present.
## Adding another scale for colour, which will replace the existing scale.
print(rats_yr_plot)
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
ggplotly(rats_yr_plot)
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
rat_sightings_tsibble <- rat_sightings_agg %>%
mutate(common_date = paste0(year, "-", month),
common_date = lubridate::ym(common_date)) %>%
as_tsibble(index = common_date, key = borough) %>%
select(borough, common_date, count) # Include common_date in the select statement
## Adding missing grouping variables: `year`
# Time series plot
rats_time_series_plot <- rat_sightings_tsibble %>%
ggplot(aes(x = common_date, y = count, color = borough, group = borough)) +
geom_line() +
labs(title = "Rat Sightings Time Series",
x = "Date",
y = "Rat Sightings Count",
color = "Borough") +
theme_minimal()
# Print the plot
print(rats_time_series_plot)
ggplotly(rats_time_series_plot)
borough_data <- rat_sightings_tsibble %>%
filter(borough == "Brooklyn")
# Convert data to a univariate time series
ts_data <- ts(borough_data$count, frequency = 12)
# Time series analysis using STL decomposition
ts_analysis <- stlf(ts_data, h = 12)
# Plot time series decomposition
autoplot(ts_analysis) +
labs(title = "Time Series Decomposition",
y = "Rat Sightings Count",
color = "Components") +
theme_minimal()
# Obtain forecasts
forecast_result <- ts_analysis %>%
forecast(h = 12)
# Plot forecasts
autoplot(forecast_result) +
labs(title = "Rat Sightings Forecast",
x = "Date",
y = "Rat Sightings Count") +
theme_minimal()
rat_sightings_tsibble2 <- rat_sightings_agg %>%
mutate(common_date = paste0(year, "-", month, "-01"),
common_date = lubridate::ymd(common_date),
rule_impact = ifelse(common_date >= "2023-04-01", "After", "Before")) %>%
as_tsibble(index = common_date, key = c(borough, rule_impact)) %>%
select(borough, common_date, count, rule_impact)
## Adding missing grouping variables: `year`
# Plot to analyze the impact of the rule
rats_rule_impact_plot <- rat_sightings_tsibble2 %>%
ggplot(aes(x = common_date, y = count, color = rule_impact, group = interaction(borough, rule_impact))) +
geom_line() +
labs(title = "Impact of Rat Extermination Rule",
x = "Date",
y = "Rat Sightings Count",
color = "Rule Impact",
subtitle = "Comparison Before and After April 2013") +
theme(text = element_text(size = 15),
axis.text.x = element_text(angle = 60, hjust = 1, size = 10)) +
scale_colour_discrete("Year") +
scale_x_date(date_breaks = "3 month", labels = function(x) format(x, "%b")) +
scale_color_viridis_d(end = .8)
## Scale for colour is already present.
## Adding another scale for colour, which will replace the existing scale.
# Print the plot
print(rats_rule_impact_plot)
ggplotly(rats_rule_impact_plot)
viz1_data = rats_boro %>%
pivot_longer(cols = starts_with("20"),
names_to = "Year",
values_to = "avg_rat_sightings"
)
ggplot(viz1_data, aes(x = Year, y = avg_rat_sightings)) +
geom_point(alpha = 0.3, size = 2) +
geom_line(size = 1, alpha = 0.6) +
facet_wrap(~borough, scales = "free_y") +
theme(legend.position = "bottom",
axis.text.y = element_text(color = "black",
size = 10, hjust = 1),
axis.text.x = element_text(angle = 45,
hjust = 1, size = 10)) +
labs(
x = "Year",
y = "Average Rat Sightings",
title = "Average Rate Sightings From 2019-2023 by Borough"
) +
viridis::scale_colour_viridis()
## `geom_line()`: Each group consists of only one observation.
## ℹ Do you need to adjust the group aesthetic?
## `geom_line()`: Each group consists of only one observation.
## ℹ Do you need to adjust the group aesthetic?
## `geom_line()`: Each group consists of only one observation.
## ℹ Do you need to adjust the group aesthetic?
## `geom_line()`: Each group consists of only one observation.
## ℹ Do you need to adjust the group aesthetic?
# I cannot seem to get the line to form so I will try it with a different data format
ggplot(rat_sightings_agg, aes(x = year, y = count)) +
geom_point(alpha = 0.3, size = 2) +
geom_line(size = 1, alpha = 0.6) +
facet_wrap(~borough, scales = "free_y") +
theme(legend.position = "bottom",
axis.text.y = element_text(color = "black",
size = 10, hjust = 1),
axis.text.x = element_text(angle = 45,
hjust = 1, size = 10)) +
labs(
x = "Year",
y = "Average Rat Sightings",
title = "Average Rate Sightings From 2019-2023 by Borough"
) +
viridis::scale_colour_viridis()
# This graph is also not what I had in mind